pubchem.endpoint Schema Extraction¶

This notebook demonstrates RDF schema extraction from the pubchem.endpoint SPARQL endpoint. It discovers VoID (Vocabulary of Interlinked Datasets) descriptions and generates JSON-LD as the source for all downstream outputs including frequency analysis and LinkML schemas.

Exports¶

  • JSON-LD Schema (primary output)
  • N-Quads RDF
  • VoID Graph for the dataset in its original source
  • Coverage report
  • LinkML Schema
  • Full parquet entity dataframe
In [1]:
# Dataset Configuration
import os

# Dataset parameters
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
dataset_name = "pubchem.endpoint"
void_iri = "http://rdf.ncbi.nlm.nih.gov/pubchem/endpoint"
graph_uri = "http://rdf.ncbi.nlm.nih.gov/pubchem/endpoint"

# Setup paths
working_path = os.path.abspath("")
exports_path = os.path.join(
    working_path, "..", "..", "docs", "data", "schema_extraction", dataset_name
)
os.makedirs(exports_path, exist_ok=True)
In [2]:
import logging
import sys

# Minimal notebook logger using existing dataset_name
logger = logging.getLogger(dataset_name or "notebook")
logger.setLevel(logging.DEBUG)  # Set to DEBUG to see SPARQL queries

# Also configure the rdfsolve.parser logger to see query details
parser_logger = logging.getLogger("rdfsolve.parser")
parser_logger.setLevel(logging.DEBUG)

# Avoid adding duplicate handlers if the cell is re-run
if not logger.handlers:
    fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s", "%Y-%m-%d %H:%M:%S")

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.DEBUG)  # Set to DEBUG to see all logs
    sh.setFormatter(fmt)
    logger.addHandler(sh)

    # Add the same handler to the parser logger
    parser_logger.addHandler(sh)

logger.info(f"Logging configured for {dataset_name}")
2025-12-02 10:00:56 INFO pubchem.endpoint: Logging configured for pubchem.endpoint
In [3]:
# Import libraries
import json

# Configure Plotly for HTML output
import plotly.io as pio
import plotly.offline as pyo
from IPython.display import Markdown, display

# Import rdfsolve API functions
from rdfsolve.api import (
    discover_void_graphs,
    generate_void_from_endpoint,
    load_parser_from_graph,
    retrieve_void_from_graphs,
)
from rdfsolve.sparql_helper import SparqlHelper

# Enable query collection to track all SPARQL queries executed
SparqlHelper.enable_query_collection()

# Set renderer to 'notebook' for Jupyter, but ensure HTML export works
pio.renderers.default = "notebook+plotly_mimetype"

# Initialize offline mode for Plotly
pyo.init_notebook_mode(connected=True)
In [4]:
# Pickle caching utilities
import os
import pickle


def save_cache(data, filename, cache_dir=None):
    """Save data to pickle cache."""
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")
    os.makedirs(cache_dir, exist_ok=True)

    cache_path = os.path.join(cache_dir, f"{filename}.pkl")
    with open(cache_path, "wb") as f:
        pickle.dump(data, f)
    print(f"Cached data to: {cache_path}")
    return cache_path


def load_cache(filename, cache_dir=None):
    """Load data from pickle cache if it exists."""
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    cache_path = os.path.join(cache_dir, f"{filename}.pkl")
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
        print(f"Loaded cached data from: {cache_path}")
        return data
    return None


def cache_exists(filename, cache_dir=None):
    """Check if cache file exists."""
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    cache_path = os.path.join(cache_dir, f"{filename}.pkl")
    return os.path.exists(cache_path)
In [5]:
# Cache management utilities
def list_cache_files(cache_dir=None):
    """List all cache files."""
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    if not os.path.exists(cache_dir):
        print("No cache directory found")
        return []

    cache_files = [f for f in os.listdir(cache_dir) if f.endswith(".pkl")]
    print(f"Cache directory: {cache_dir}")
    for f in cache_files:
        file_path = os.path.join(cache_dir, f)
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"  {f} ({size_mb:.2f} MB)")
    return cache_files


def clear_cache(filename=None, cache_dir=None):
    """Clear specific cache file or all cache."""
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    if filename:
        cache_path = os.path.join(cache_dir, f"{filename}.pkl")
        if os.path.exists(cache_path):
            os.remove(cache_path)
            print(f"Removed cache: {filename}")
        else:
            print(f"Cache not found: {filename}")
    else:
        # Clear all cache files
        if os.path.exists(cache_dir):
            import shutil

            shutil.rmtree(cache_dir)
            print("Cleared all cache files")
        else:
            print("No cache directory to clear")


# Show current cache status
list_cache_files()
No cache directory found
Out[5]:
[]

Cache Control¶

Use these cells to manage cached data. When testing new code changes, you may want to clear relevant cache files to force re-computation.

In [6]:
# Clear specific cache files (uncomment lines as needed for testing)

# When testing new VoID discovery/generation:
# clear_cache(f"{dataset_name}_voidgraph")

# When testing JSON-LD generation (primary output):
# clear_cache(f"{dataset_name}_jsonld_schema")

# When testing frequency calculations:
# clear_cache(f"{dataset_name}_frequencies_basic")
# clear_cache(f"{dataset_name}_frequencies_with_instances")

# Clear everything:
clear_cache()

print("Cache control ready")
print("Note: VoID graph and JSON-LD are the primary caches")
No cache directory to clear
Cache control ready
Note: VoID graph and JSON-LD are the primary caches

Discover or get VoID Schema¶

In [7]:
# Discover or generate VoID schema with caching
cache_key = f"{dataset_name}_voidgraph"

# Try to load from cache first
void_graph = load_cache(cache_key)

if void_graph is None:
    print("VoID graph not found in cache, attempting discovery...")

    # Step 1: Try to discover existing VoID graphs
    discovery_result = discover_void_graphs(
        endpoint_url, graph_uris=[graph_uri] if graph_uri else None
    )

    found_graphs = discovery_result.get("found_graphs", [])
    partitions = discovery_result.get("partitions", [])

    if found_graphs and partitions:
        print(f"Found {len(found_graphs)} VoID graphs with {len(partitions)} partitions")
        # Build VoID graph directly from partition data (no CONSTRUCT query needed)
        void_graph = retrieve_void_from_graphs(
            endpoint_url,
            found_graphs,
            graph_uris=[graph_uri] if graph_uri else None,
            partitions=partitions,  # Pass partition data directly
        )

        # Save to file
        void_path = os.path.join(exports_path, f"{dataset_name}_existing_void.ttl")
        void_graph.serialize(destination=void_path, format="turtle")
        print(f"Built VoID graph from: {', '.join(found_graphs)}")
    else:
        print("No VoID graphs found, generating from queries...")
        # Step 2: Generate new VoID if none found
        void_graph = generate_void_from_endpoint(
            endpoint_url=endpoint_url,
            graph_uris=[graph_uri] if graph_uri else None,
            output_file=os.path.join(exports_path, f"{dataset_name}_generated_void.ttl"),
            counts=True,
            offset_limit_steps=300,
            exclude_graphs=True,
        )

    # Cache the VoID graph for future use
    save_cache(void_graph, cache_key)
    print(f"VoID graph cached with {len(void_graph)} triples")
else:
    print(f"Loaded VoID graph from cache ({len(void_graph)} triples)")

# Load parser from the VoID graph
vp = load_parser_from_graph(void_graph, graph_uris=[graph_uri] if graph_uri else None)
VoID graph not found in cache, attempting discovery...
2025-12-02 10:00:58 DEBUG rdfsolve.parser: Starting VoID partition discovery for https://idsm.elixir-czech.cz/sparql/endpoint/idsm
2025-12-02 10:00:58 INFO rdfsolve.parser: Discovering VoID partitions across all graphs
Query attempt 1/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
Query attempt 2/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
Query attempt 3/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
SELECT failed after 3 tries
2025-12-02 10:01:03 INFO rdfsolve.parser: VoID discovery failed: Query failed after 3 attempts: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
2025-12-02 10:01:03 DEBUG rdfsolve.parser: Discovery exception: EndpointError: Query failed after 3 attempts: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
No VoID graphs found, generating from queries...
2025-12-02 10:05:15 INFO rdfsolve.parser: Successfully extracted 478 RDF triples
2025-12-02 10:05:15 INFO rdfsolve.parser: VoID description saved to /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_generated_void.ttl
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/cache/pubchem.endpoint_voidgraph.pkl
VoID graph cached with 478 triples

Schema Discovery and Exports Workflow¶

Workflow Steps:¶

  1. VoID Discovery: Extract schema patterns from SPARQL endpoint VoID descriptions
  2. JSON-LD Generation: Convert to JSON-LD.
  3. Derived Outputs: All other formats are generated from the JSON-LD structure:
    • Frequencies: Schema pattern coverage analysis
    • LinkML: LinkML YAML used elsewhere for other features.
    • CSV/JSON: Tabular and structured data exports
    • RDF: N-Quads serialization for triplestore import
In [8]:
# Primary JSON-LD schema export and basic summary
cache_key = f"{dataset_name}_jsonld_schema"
jsonld_schema = load_cache(cache_key)

if jsonld_schema is None:
    print("Generating JSON-LD schema...")
    jsonld_schema = vp.to_jsonld(filter_void_admin_nodes=True)
    save_cache(jsonld_schema, cache_key)
else:
    print("Loaded JSON-LD schema from cache")

# Save JSON-LD schema file
jsonld_file = os.path.join(exports_path, f"{dataset_name}_schema.jsonld")
with open(jsonld_file, "w", encoding="utf-8") as f:
    json.dump(jsonld_schema, f, indent=2, ensure_ascii=False)

print(f"JSON-LD Schema saved to: {jsonld_file}")

# Display combined JSON-LD structure info and schema summary
if "@graph" in jsonld_schema:
    print("\nSchema Summary:")
    print(f"   • Prefixes: {len(jsonld_schema['@context'])}")
    print(f"   • Resources: {len(jsonld_schema['@graph'])}")

    # Show dataset metadata
    dataset_info = jsonld_schema["@graph"][0] if jsonld_schema["@graph"] else {}
    if dataset_info.get("@type") == "void:Dataset":
        print(f"   • Dataset: {dataset_info.get('dcterms:title', 'Unknown')}")
        print(f"   • Classes: {dataset_info.get('void:classes', 0)}")
        print(f"   • Properties: {dataset_info.get('void:properties', 0)}")
        print(f"   • Triples: {dataset_info.get('void:triples', 0)}")

# Get schema DataFrame and show sample
schema_df = vp.to_schema(filter_void_admin_nodes=True)
print(f"\nSchema Patterns Preview ({len(schema_df)} total):")
display(schema_df.head())
Generating JSON-LD schema...
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/cache/pubchem.endpoint_jsonld_schema.pkl
JSON-LD Schema saved to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_schema.jsonld

Schema Summary:
   • Prefixes: 8
   • Resources: 28
Schema Patterns Preview (31920 total):
subject_class subject_uri property property_uri object_class object_uri
0 bao:0000034 http://www.bioassayontology.org/bao#BAO_0000034 iao:0000136 http://purl.obolibrary.org/obo/IAO_0000136 chebi:9 http://purl.obolibrary.org/obo/CHEBI_9
1 bao:0000034 http://www.bioassayontology.org/bao#BAO_0000034 iao:0000136 http://purl.obolibrary.org/obo/IAO_0000136 chebi:10 http://purl.obolibrary.org/obo/CHEBI_10
2 bao:0000034 http://www.bioassayontology.org/bao#BAO_0000034 iao:0000136 http://purl.obolibrary.org/obo/IAO_0000136 chebi:17 http://purl.obolibrary.org/obo/CHEBI_17
3 bao:0000034 http://www.bioassayontology.org/bao#BAO_0000034 iao:0000136 http://purl.obolibrary.org/obo/IAO_0000136 chebi:20 http://purl.obolibrary.org/obo/CHEBI_20
4 bao:0000034 http://www.bioassayontology.org/bao#BAO_0000034 iao:0000136 http://purl.obolibrary.org/obo/IAO_0000136 chebi:22 http://purl.obolibrary.org/obo/CHEBI_22

Schema Pattern Coverage Analysis¶

Calculate coverage ratios showing what percentage of entities use each relationship pattern.

In [9]:
# Schema pattern coverage analysis and export
cache_key = f"{dataset_name}_frequencies_basic"
cached_data = load_cache(cache_key)

if cached_data is None:
    print("Calculating schema pattern frequencies...")
    frequencies_df, _ = vp.count_schema_shape_frequencies(
        endpoint_url=endpoint_url,
        offset_limit_steps=300,
    )
    save_cache(frequencies_df, cache_key)
else:
    print("Loaded frequencies DataFrame from cache")
    frequencies_df = cached_data

# Export coverage analysis
frequencies_output_path = os.path.join(exports_path, f"{dataset_name}_pattern_coverage.csv")
exported_df = vp.export_schema_shape_frequencies(
    frequencies_df, output_file=frequencies_output_path
)

# Combined summary and sample
if not frequencies_df.empty:
    avg_coverage = frequencies_df["coverage_percent"].mean()
    high_coverage = (frequencies_df["coverage_percent"] > 50).sum()

    print("\nPattern Coverage Analysis:")
    print(f"   • Total patterns: {len(frequencies_df)}")
    print(f"   • Average coverage: {avg_coverage:.1f}%")
    print(f"   • High coverage (>50%): {high_coverage}")
    print(f"   • Exported to: {frequencies_output_path}")

    print("\nSample Coverage Data:")
    display(
        frequencies_df[["subject_class", "property", "object_class", "coverage_percent"]].head()
    )

    print("\nCoverage Statistics:")
    display(frequencies_df["coverage_percent"].describe())
else:
    print("No frequency data available")
Calculating schema pattern frequencies...
2025-12-02 10:05:29 INFO rdfsolve.parser: Using chunked pagination for entity counts (step size: 300)
INFO:rdfsolve.parser:Using chunked pagination for entity counts (step size: 300)
2025-12-02 10:05:33 DEBUG rdfsolve.parser: Chunked entity count: chunk 1, rows=28, total=28
DEBUG:rdfsolve.parser:Chunked entity count: chunk 1, rows=28, total=28
2025-12-02 10:05:33 INFO rdfsolve.parser: Chunked entity counting complete: 1 chunks, 28 total results
INFO:rdfsolve.parser:Chunked entity counting complete: 1 chunks, 28 total results
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/cache/pubchem.endpoint_frequencies_basic.pkl
Pattern Coverage Analysis:
   • Total patterns: 31920
   • Average coverage: 0.0%
   • High coverage (>50%): 0
   • Exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_pattern_coverage.csv

Sample Coverage Data:
subject_class property object_class coverage_percent
31903 bao:0003036 resource:has-unit chebi:2294 0.0
31902 bao:0003036 resource:has-unit chebi:2268 0.0
31901 bao:0003036 resource:has-unit chebi:2264 0.0
31900 bao:0003036 resource:has-unit chebi:2256 0.0
31899 bao:0003036 resource:has-unit chebi:2255 0.0
Coverage Statistics:
count    31920.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: coverage_percent, dtype: float64

Schema Pattern Instance Collection¶

Collect actual subject and object IRI instances for each schema pattern. This provides detailed access to the specific entities participating in each relationship pattern.

In [10]:
# Collect both frequency data and actual instances with caching
cache_key = f"{dataset_name}_frequencies_with_instances"
cached_data = load_cache(cache_key)

if cached_data is None:
    print("Collecting frequency data and instances...")
    frequencies_with_instances_df, instances_df = vp.count_schema_shape_frequencies(
        endpoint_url=endpoint_url,
        # sample_limit=100,  # Limited sample for demonstration
        collect_instances=True,
        offset_limit_steps=300,
    )
    # Cache both DataFrames as a tuple
    save_cache((frequencies_with_instances_df, instances_df), cache_key)
else:
    print("Loaded frequencies and instances DataFrames from cache")
    frequencies_with_instances_df, instances_df = cached_data

# Display basic information about the data structure
print(f"Frequencies DataFrame: {len(frequencies_with_instances_df)} shapes")
if frequencies_with_instances_df is not None:
    print(
        f"Memory usage - Instances: {frequencies_with_instances_df.memory_usage(deep=True).sum() / 1024:.1f} KB"
    )
else:
    print("No instances collected")
Collecting frequency data and instances...
2025-12-02 11:02:04 INFO rdfsolve.parser: Using chunked pagination for entity counts (step size: 300)
INFO:rdfsolve.parser:Using chunked pagination for entity counts (step size: 300)
2025-12-02 11:02:40 DEBUG rdfsolve.parser: Chunked entity count: chunk 1, rows=28, total=28
DEBUG:rdfsolve.parser:Chunked entity count: chunk 1, rows=28, total=28
2025-12-02 11:02:40 INFO rdfsolve.parser: Chunked entity counting complete: 1 chunks, 28 total results
INFO:rdfsolve.parser:Chunked entity counting complete: 1 chunks, 28 total results
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/cache/pubchem.endpoint_frequencies_with_instances.pkl
Frequencies DataFrame: 31920 shapes
Memory usage - Instances: 22992.8 KB
In [11]:
import pandas as pd
import plotly.graph_objects as go

if not frequencies_with_instances_df.empty:
    df = frequencies_with_instances_df.copy()
    df["coverage_percent"] = pd.to_numeric(df["coverage_percent"], errors="coerce").fillna(0)
    df = df.sort_values("coverage_percent", ascending=False).reset_index(drop=True)

    def make_label(row):
        return (
            f"<b>{row['subject_class']}</b> "
            f"<span style='color:#888;'></span> "
            f"<i>{row['property']}</i> "
            f"<span style='color:#888;'></span> "
            f"<b>{row['object_class']}</b>"
        )

    df["styled_label"] = df.apply(make_label, axis=1)

    text_positions = ["outside" if v < 95 else "inside" for v in df["coverage_percent"]]
    custom_colorscale = [
        [0.0, "#d36e61"],
        [0.4, "#e5cdbd"],
        [0.7, "#e8e4cf"],
        [1.0, "#c3d9c0"],
    ]

    # Figure sizing
    bar_height = 26
    fig_height = min(2000, bar_height * len(df) + 200)

    fig = go.Figure(
        go.Bar(
            x=df["coverage_percent"],
            y=df["styled_label"],
            orientation="h",
            text=[f"{v:.1f}%" for v in df["coverage_percent"]],
            textposition=text_positions,
            marker={
                "color": df["coverage_percent"],
                "colorscale": custom_colorscale,
                "cmin": 0,
                "cmax": 100,
                "line": {"color": "white", "width": 0.6},
            },
            hovertemplate="<b>%{y}</b><br>Coverage: %{x:.1f}%<extra></extra>",
        )
    )

    fig.update_layout(
        title={
            "text": f"Schema Pattern Coverage for {dataset_name}",
            "x": 0.5,
            "font": {"size": 18},
        },
        xaxis={
            "title": "Coverage (%)",
            "range": [0, 100],  # fixed x-axis range
            "ticksuffix": "%",
            "showgrid": True,
            "gridcolor": "rgba(220,220,220,0.3)",
        },
        yaxis={
            "title": "",
            "autorange": "reversed",
            "automargin": True,
            "fixedrange": False,  # allow vertical zoom/pan
        },
        template="plotly_white",
        autosize=True,  # allow figure to scale with container
        height=fig_height,  # base height (will scale)
        margin={"t": 80, "b": 50, "l": 480, "r": 150},  # extra right margin for text
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    # Disable horizontal zoom/pan
    fig.update_xaxes(fixedrange=True)

    # Show figure with config for HTML export compatibility
    fig.show(
        config={
            "scrollZoom": True,
            "responsive": True,
            "toImageButtonOptions": {
                "format": "png",
                "filename": f"{dataset_name}_schema_coverage",
                "height": fig_height,
                "width": 600,
                "scale": 1,
            },
        }
    )

else:
    display(Markdown("**No coverage data to visualize**"))

LinkML (derived from JSON-LD)¶

In [12]:
# Generate LinkML directly from JSON-LD with custom schema URI
print("Regenerating LinkML schema from JSON-LD with custom schema URI...")

schema_name = f"{dataset_name}_schema"
custom_schema_uri = (
    f"http://jmillanacosta.github.io/rdfsolve/{dataset_name}/linkml"  # User-definable base URI
)

yaml_text = vp.to_linkml_yaml(
    schema_name=schema_name,
    schema_description=f"LinkML schema for {dataset_name} generated from JSON-LD",
    schema_base_uri=custom_schema_uri,
    filter_void_nodes=True,
)

# Save to LinkML YAML
linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
with open(linkml_file, "w", encoding="utf-8") as f:
    f.write(yaml_text)

print(f"LinkML YAML saved to: {linkml_file}")
Regenerating LinkML schema from JSON-LD with custom schema URI...
LinkML YAML saved to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_linkml_schema.yaml
In [13]:
from linkml.generators.erdiagramgen import ERDiagramGenerator
from linkml_runtime.utils.schemaview import SchemaView

sv = SchemaView(linkml_file)
linkml_schema = sv.schema

display(
    Markdown(
        f"**Parsed LinkML schema:** Classes = {len(sv.all_classes())}, Slots = {len(sv.all_slots())}"
    )
)

# Build and display a Mermaid class diagram for the aopwikirdf LinkedML
mermaid_code = ERDiagramGenerator(linkml_file).serialize()

display(Markdown(mermaid_code))

Parsed LinkML schema: Classes = 29, Slots = 5

erDiagram
Bao0000034 {

}
Bao0000186 {

}
Bao0000187 {

}
Bao0000188 {

}
Bao0000189 {

}
Bao0000190 {

}
Bao0000192 {

}
Bao0000194 {

}
Bao0000349 {

}
Bao0000477 {

}
Bao0002117 {

}
Bao0002144 {

}
Bao0002145 {

}
Bao0002146 {

}
Bao0002162 {

}
Bao0002862 {

}
Bao0002877 {

}
Bao0002878 {

}
Bao0002879 {

}
Bao0002880 {

}
Bao0002881 {

}
Bao0002882 {

}
Bao0002883 {

}
Bao0002884 {

}
Bao0002886 {

}
Bao0002887 {

}
Bao0003036 {

}
Chebi9 {

}
VocabularyEndpoint {

}

Bao0000034 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000034 ||--|o Chebi9 : "iao_0000136"
Bao0000034 ||--|o Chebi9 : "resource_has_unit"
Bao0000034 ||--|o Chebi9 : "sio_000221"
Bao0000034 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000186 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000186 ||--|o Chebi9 : "iao_0000136"
Bao0000186 ||--|o Chebi9 : "resource_has_unit"
Bao0000186 ||--|o Chebi9 : "sio_000221"
Bao0000186 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000187 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000187 ||--|o Chebi9 : "iao_0000136"
Bao0000187 ||--|o Chebi9 : "resource_has_unit"
Bao0000187 ||--|o Chebi9 : "sio_000221"
Bao0000187 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000188 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000188 ||--|o Chebi9 : "iao_0000136"
Bao0000188 ||--|o Chebi9 : "resource_has_unit"
Bao0000188 ||--|o Chebi9 : "sio_000221"
Bao0000188 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000189 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000189 ||--|o Chebi9 : "iao_0000136"
Bao0000189 ||--|o Chebi9 : "resource_has_unit"
Bao0000189 ||--|o Chebi9 : "sio_000221"
Bao0000189 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000190 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000190 ||--|o Chebi9 : "iao_0000136"
Bao0000190 ||--|o Chebi9 : "resource_has_unit"
Bao0000190 ||--|o Chebi9 : "sio_000221"
Bao0000190 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000192 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000192 ||--|o Chebi9 : "iao_0000136"
Bao0000192 ||--|o Chebi9 : "resource_has_unit"
Bao0000192 ||--|o Chebi9 : "sio_000221"
Bao0000192 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000194 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000194 ||--|o Chebi9 : "iao_0000136"
Bao0000194 ||--|o Chebi9 : "resource_has_unit"
Bao0000194 ||--|o Chebi9 : "sio_000221"
Bao0000194 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000349 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000349 ||--|o Chebi9 : "iao_0000136"
Bao0000349 ||--|o Chebi9 : "resource_has_unit"
Bao0000349 ||--|o Chebi9 : "sio_000221"
Bao0000349 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0000477 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0000477 ||--|o Chebi9 : "iao_0000136"
Bao0000477 ||--|o Chebi9 : "resource_has_unit"
Bao0000477 ||--|o Chebi9 : "sio_000221"
Bao0000477 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002117 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002117 ||--|o Chebi9 : "iao_0000136"
Bao0002117 ||--|o Chebi9 : "resource_has_unit"
Bao0002117 ||--|o Chebi9 : "sio_000221"
Bao0002117 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002144 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002144 ||--|o Chebi9 : "iao_0000136"
Bao0002144 ||--|o Chebi9 : "resource_has_unit"
Bao0002144 ||--|o Chebi9 : "sio_000221"
Bao0002144 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002145 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002145 ||--|o Chebi9 : "iao_0000136"
Bao0002145 ||--|o Chebi9 : "resource_has_unit"
Bao0002145 ||--|o Chebi9 : "sio_000221"
Bao0002145 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002146 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002146 ||--|o Chebi9 : "iao_0000136"
Bao0002146 ||--|o Chebi9 : "resource_has_unit"
Bao0002146 ||--|o Chebi9 : "sio_000221"
Bao0002146 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002162 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002162 ||--|o Chebi9 : "iao_0000136"
Bao0002162 ||--|o Chebi9 : "resource_has_unit"
Bao0002162 ||--|o Chebi9 : "sio_000221"
Bao0002162 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002862 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002862 ||--|o Chebi9 : "iao_0000136"
Bao0002862 ||--|o Chebi9 : "resource_has_unit"
Bao0002862 ||--|o Chebi9 : "sio_000221"
Bao0002862 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002877 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002877 ||--|o Chebi9 : "iao_0000136"
Bao0002877 ||--|o Chebi9 : "resource_has_unit"
Bao0002877 ||--|o Chebi9 : "sio_000221"
Bao0002877 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002878 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002878 ||--|o Chebi9 : "iao_0000136"
Bao0002878 ||--|o Chebi9 : "resource_has_unit"
Bao0002878 ||--|o Chebi9 : "sio_000221"
Bao0002878 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002879 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002879 ||--|o Chebi9 : "iao_0000136"
Bao0002879 ||--|o Chebi9 : "resource_has_unit"
Bao0002879 ||--|o Chebi9 : "sio_000221"
Bao0002879 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002880 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002880 ||--|o Chebi9 : "iao_0000136"
Bao0002880 ||--|o Chebi9 : "resource_has_unit"
Bao0002880 ||--|o Chebi9 : "sio_000221"
Bao0002880 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002881 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002881 ||--|o Chebi9 : "iao_0000136"
Bao0002881 ||--|o Chebi9 : "resource_has_unit"
Bao0002881 ||--|o Chebi9 : "sio_000221"
Bao0002881 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002882 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002882 ||--|o Chebi9 : "iao_0000136"
Bao0002882 ||--|o Chebi9 : "resource_has_unit"
Bao0002882 ||--|o Chebi9 : "sio_000221"
Bao0002882 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002883 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002883 ||--|o Chebi9 : "iao_0000136"
Bao0002883 ||--|o Chebi9 : "resource_has_unit"
Bao0002883 ||--|o Chebi9 : "sio_000221"
Bao0002883 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002884 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002884 ||--|o Chebi9 : "iao_0000136"
Bao0002884 ||--|o Chebi9 : "resource_has_unit"
Bao0002884 ||--|o Chebi9 : "sio_000221"
Bao0002884 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002886 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002886 ||--|o Chebi9 : "iao_0000136"
Bao0002886 ||--|o Chebi9 : "resource_has_unit"
Bao0002886 ||--|o Chebi9 : "sio_000221"
Bao0002886 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0002887 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0002887 ||--|o Chebi9 : "iao_0000136"
Bao0002887 ||--|o Chebi9 : "resource_has_unit"
Bao0002887 ||--|o Chebi9 : "sio_000221"
Bao0002887 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
Bao0003036 ||--|o Chebi9 : "cito_cites_As_Data_Source"
Bao0003036 ||--|o Chebi9 : "iao_0000136"
Bao0003036 ||--|o Chebi9 : "resource_has_unit"
Bao0003036 ||--|o Chebi9 : "sio_000221"
Bao0003036 ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
VocabularyEndpoint ||--|o Chebi9 : "cito_cites_As_Data_Source"
VocabularyEndpoint ||--|o Chebi9 : "iao_0000136"
VocabularyEndpoint ||--|o Chebi9 : "resource_has_unit"
VocabularyEndpoint ||--|o Chebi9 : "sio_000221"
VocabularyEndpoint ||--|o Chebi9 : "vocabulary_Pub_Chem_Assay_Outcome"
In [14]:
json_path = os.path.join(exports_path, f"{dataset_name}_schema.json")
csv_path = os.path.join(exports_path, f"{dataset_name}_schema.csv")

# Export CSV from frequencies
frequencies_df.to_csv(csv_path, index=False)

# Export JSON derived from JSON-LD (maintains consistency)
with open(json_path, "w", encoding="utf-8") as fh:
    json.dump(vp.to_json(filter_void_nodes=True), fh, indent=2)

print(f"CSV exported to: {csv_path}")
print(f"JSON exported to: {json_path}")
CSV exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_schema.csv
JSON exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_schema.json
In [15]:
# Export collected SPARQL queries as TTL
queries_path = os.path.join(exports_path, f"{dataset_name}_sparql_queries.ttl")
queries = SparqlHelper.get_collected_queries()

if queries:
    ttl_content = SparqlHelper.export_queries_as_ttl(
        output_file=queries_path,
        base_uri=f"https://github.com/jmillanacosta/rdfsolve/sparql/{dataset_name}/",
        dataset_name=dataset_name,
    )
    print(f"Exported {len(queries)} SPARQL queries to: {queries_path}")
else:
    print("No SPARQL queries were collected")
Exported 63826 SPARQL queries to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.endpoint/pubchem.endpoint_sparql_queries.ttl